'''
Author: LHY 2601958675@qq.com
Date: 2023-06-29 18:07:26
LastEditors: LHY
LastEditTime: 2023-12-06 17:44:45
Description: 
'''





import csv
import re

import requests
# Shared output handle for the scraped rows. Append mode means re-running the
# script adds rows to top250.csv instead of truncating it.
f = open('top250.csv', 'a', encoding='utf-8')

# Step of the `start` query parameter between consecutive list pages.
_PAGE_SIZE = 25

# Seconds to wait on the HTTP request before giving up instead of hanging.
_REQUEST_TIMEOUT = 10

# Matches one movie entry of the list page and captures its fields by name.
# re.S lets "." match newlines as well. Compiled once at import time rather
# than on every call.
_ITEM_PATTERN = re.compile(
    r'<div class="item">(.*?)<img width="100" alt=(.*?)src="(?P<image>.*?)" class="">'
    r'(.*?)<span class="title">(?P<name>.*?)</span>'
    r'(.*?)<p class="">.*?导演:(?P<author>.*?)&nbsp;(.*?)主演:(?P<mainRole>.*?)..<br>'
    r'(?P<year>.*?)&nbsp;/&nbsp;(?P<country>.*?)&nbsp;/&nbsp;(?P<category>.*?)</p>'
    r'(.*?)<span class="rating_num" property="v:average">(?P<score>.*?)</span>',
    re.S,
)


def pyDB(page):
    """Scrape one page of the Douban Top 250 list and append it to the CSV.

    Downloads the list page, extracts every movie entry matched by
    ``_ITEM_PATTERN``, prints the main fields, and writes one CSV row per
    movie to the module-level file object ``f``. Prints ``done`` at the end.

    Args:
        page: 1-based page number. Page 1 requests ``start=0``, page 2
            requests ``start=25``, and so on.

    Raises:
        ValueError: If ``page`` is smaller than 1.
        requests.RequestException: If the HTTP request fails or times out.
    """
    if page < 1:
        raise ValueError(f"page must be >= 1, got {page!r}")

    # The offset is (page - 1) * 25 so that page 1 starts at the first entry;
    # using page * 25 would silently skip the first page of results.
    url = f"https://movie.douban.com/top250?start={(page - 1) * _PAGE_SIZE}&filter="

    headers = {"Referer":"https://movie.douban.com/","User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"}

    response = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
    try:
        page_html = response.text
    finally:
        # Release the connection even if decoding the body fails.
        response.close()

    # csv.writer quotes fields containing commas, quotes or newlines, which a
    # hand-built comma-joined line would not. lineterminator='\n' keeps the
    # same row ending the file previously received.
    writer = csv.writer(f, lineterminator='\n')

    for item in _ITEM_PATTERN.finditer(page_html):
        # strip() removes the whitespace surrounding each captured field.
        name = item.group("name").strip()
        author = item.group("author").strip()
        mainRole = item.group("mainRole").strip()
        year = item.group("year").strip()
        country = item.group("country").strip()
        category = item.group("category").strip()
        score = item.group("score").strip()
        image = item.group("image").strip()

        # Small HTML snippet linking to the captured image URL; stored as the
        # last CSV column.
        content = f'请点击这里跳转到<a href={image}>示例网站</a>。'
        html_content = f'<!DOCTYPE html><html><body>{content}</body></html>'

        print(name)
        print(author)
        print(mainRole)
        print(year)
        print(country)
        print(category)

        writer.writerow(
            [name, author, mainRole, year, country, category, score, html_content]
        )

    print('done')


# Scrape pages 1 through 5. The try/finally guarantees the shared CSV handle
# is closed (and its buffer flushed) even when one of the page fetches raises.
# NOTE(review): at 25 entries per page the Top 250 list would span 10 pages;
# only five are requested here - confirm this is intended.
try:
    for page_number in range(1, 6):
        pyDB(page_number)
finally:
    f.close()
